"""Global ESG FEATURES SAMPLE"""
'''SAMPLE SMALLER AS ESG DATA BEGINS END OF 2001
ESG SAMPLE -
20,711 observations
619 firms
14,592 from developed nations - 17 countries
6,119 from developing nations - 19 countries
NON-ESG SAMPLE
28,035 observations
825 firms'''
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('bmh')
os.chdir('/Users/lockiemichalski/Documents/Files_Mac_before_clean/Desktop/UQ/RQ3_2020/Credit_rating_WRDS/Steps_sample/Files to upload rand/Global_data/Draft2_data')
data=pd.read_csv('GLOBAL_SAMPLE_ESG_ONLY.csv') # data
data = data[data['loc']!='US']
data.sort_values(by='rating_rank', inplace=True) # sort S&P rating feature
"WITHOUT FINANCIALS DISTRIBUTION"
plt.figure(figsize = (20, 10), dpi = 200)
sns.countplot(x='Issuer Rating', data=data).set_title('All-classes')
# class C is not in graph as there are only 2 observations and for classifiers to run this has to removed.
'''average % of non-missing values for countries in sample'''
#start from col 22
non_missing_df = pd.DataFrame() # empty df
non_missing_df[list(data.iloc[:,26:].columns)]=data.groupby('loc')[list(data.iloc[:,26:].columns)]\
.apply(lambda x: x.notnull().sum()/len(x)*100) # calc non-missing for each feature
non_missing_df['average_count']=non_missing_df.mean(axis=1)
#non-missing df excluding ESG features with missing data - refer to sum stats GLOBAL csv file
''' 421 ESG features '''
'''Referred excel file Global sum stats '''
ESG_sum_stats = pd.read_csv('GLOBAL_ESG_sum_stats.csv')
'''Only 30 features have <5% missing values'''
ESG_30_cols = list(ESG_sum_stats.iloc[:,1:30].columns)
data_cols = list(data.iloc[:,26:183].columns) # data columns
cols = data_cols + ESG_30_cols # combine ESG cols 30 features with other data in list
dropped_ESG_features = pd.DataFrame() # empty df
dropped_ESG_features[cols]=data.groupby('loc')[cols]\
.apply(lambda x: x.notnull().sum()/len(x)*100) # calc non-missing for each feature
dropped_ESG_features['average_count']=dropped_ESG_features.mean(axis=1)
'''Plot non-missing average count for each country'''
plot = non_missing_df['average_count'].reset_index()
plot.sort_values(by='average_count', ascending=False, inplace=True)
plt.figure(figsize=(16,8))
plt.bar(plot['loc'],plot['average_count'])
plt.ylabel('% non-missing values')
plt.title('Non-missing values by country')
plt.yticks(np.arange(10, 110, 10))
plt.xlabel('Country')
'''Large majority of the missing values are from ESG features not having any data'''
'''Plot non-missing average count for each country'''
plot = dropped_ESG_features['average_count'].reset_index()
plot.sort_values(by='average_count', ascending=False, inplace=True)
plt.figure(figsize=(16,8))
plt.bar(plot['loc'],plot['average_count'])
plt.ylabel('% non-missing values')
plt.title('Non-missing values by country with ESG features < 5% missing data')
plt.yticks(np.arange(10, 110, 10))
plt.xlabel('Country')
"Function to plot the missing values"
def missing_value_plot(df,title):
null_counts = df.isnull().sum()/len(df)
null_counts = null_counts.sort_values()
plt.figure(figsize=(16,8))
plt.xticks(np.arange(len(null_counts))+0.5,null_counts.index,rotation='vertical')
plt.ylabel('Fraction of rows with missing data (%)')
plt.bar(np.arange(len(null_counts)),null_counts)
plt.title(title)
plt.xlabel('Features')
missing_value_plot(data.iloc[:,31:89], 'Financial ratios') # financial ratios missing data + 3 categorical industry
'''note:
capeiq approx 18.75% - needs 5y of rolling data for each firm - some firms don't have this in the data
cash_conversion, inv_turn, inv_actq all have a numerator or denominator = 0. Those missing values are simply
undefined values. What do we do about these?'''
data.rename(columns={'Recommendation - Mean (1-5)':'Rec_Mean',
'Recommendation - Number Of Total':'Rec_Total',
'Recommendation - Median (1-5)':'Rec_Median',
'Recommendation - Low (1-5)':'Rec_Low',
'Recommendation - High (1-5)':'Rec_High',
'Recommendation - Number Of Strong Buy':'Rec_SBuy',
'Recommendation - Number Of Buy': 'Rec_Buy',
'Recommendation - Number Of Hold': 'Rec_Hold',
'Recommendation - Number Of Sell': 'Rec_Sell',
'Recommendation - Number Of Strong Sell': 'Rec_SSell',
'Recommendation - Number Of No Opinion':'Rec_NoOpinion',
'Long Term Growth - Mean':'LTG_Mean',
'Number of Analysts':'Num_Analyst'}, inplace=True)
missing_value_plot(data[['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']], 'Analyst recommendations')
'''NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the
firm having any recommendations. What should we do about this?'''
macro_features = ['balance_on_trade_bop', 'business_confidence', 'cab%gdp', 'central_government_deficit',
'consumer_confidence', 'consumer_spending', 'current_account_balance',
'export_goods_bop', 'export_prices', 'exports_goods_services', 'foreign_trade_balance',
'fx_USD', 'government_bond', 'government_external_debt', 'government_spending',
'gross_fixed_capital_investment', 'import_prices', 'imports_goods_bop',
'imports_goods_services', 'industrial_production', 'international_reserves',
'labour_force_survey', 'merchandise_exports', 'merchandise_imports', 'money_supply_M0',
'money_supply_M1', 'money_supply_M2', 'money_supply_M3', 'overnight_rate', 'population',
'retail_sales', 'sov_outlook_rank','stock_index', 'terms_of_trade',
'unemployment_rate']
missing_value_plot(data[macro_features], 'Macro')
'''Used Eikon and St Louis and missing data is data that I cannot retrieve at all for a country from the data
sources'''
'''Same missing as macro'''
macro_features_growth = [col for col in data if col.endswith('%')]
missing_value_plot(data[macro_features_growth], 'Macro growth')
missing_value_plot(data[ESG_30_cols], 'ESG features <5% missing values') # financial ratios missing data + 3 categorical industry
print(macro_features)
'''Box plot of numeric macro features'''
fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Capitalization'''
fin_capitalization=['capital_ratioq']
fig, ax = plt.subplots(1,2, figsize=(20, 10))
for var, subplot in zip(fin_capitalization, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Efficiency'''
fin_efficiency=['at_turnq','inv_turnq','pay_turnq','rect_turnq','sale_equityq','sale_nwcq']
fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_efficiency, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Financial Soundness'''
fin_finsoundness=['invt_actq','rect_actq','fcf_ocfq','ocf_lctq','cash_debtq',
'cash_ltq','cfmq','short_debtq','profit_lctq',
'curr_debtq','debt_ebitdaq','dltt_beq','int_debtq',
'int_totdebtq','lt_debtq','lt_ppentq']
fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_finsoundness, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Liquidity'''
fin_liquidity=['cash_ratioq','curr_ratioq','quick_ratioq','accrualq']
fig, ax = plt.subplots(2,2, figsize=(30, 30))
for var, subplot in zip(fin_liquidity, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Profitability''' #NO FINANCIALS
fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']
fig, ax = plt.subplots(7,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Solvency'''
fin_solvency = ['de_ratioq','debt_atq','lt_atq','debt_capitalq','intcovq','intcov_ratioq']
fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_solvency, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Valuation'''
fin_valuation = ['dprq','bmq','capeiq','evmq','pcfq','psq','ptbq']
fig, ax = plt.subplots(4,2, figsize=(30, 30))
for var, subplot in zip(fin_valuation, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - other'''
fin = ['USD_mktcap','USD_price_adj']
fig, ax = plt.subplots(1,3, figsize=(30, 10))
for var, subplot in zip(fin, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
''''no more than 5% missing data is in 30 of the features for global sample'''
ESG_30_cols = list(ESG_sum_stats.iloc[:,1:30].columns)
print(ESG_30_cols)
fig, ax = plt.subplots(10,3, figsize=(30, 40))
for var, subplot in zip(ESG_30_cols, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
# get the distribution of a feature per class with Yeo-Johnson transformation
from scipy.stats import anderson
from scipy.stats import kstest
#from scipy.stats import zscore
from scipy.stats import yeojohnson
import scipy.stats as st
'''Interested in each distribution by each class'''
def get_best_distribution_groupby(data,rating):
dist_names = ["norm", "lognorm","exponweib", "weibull_max",
"weibull_min", "pareto", "genextreme","logistic"]
dist_results = []
params = {}
for dist_name in dist_names:
dist = getattr(st, dist_name)
param = dist.fit(data)
params[dist_name] = param
# Applying the Kolmogorov-Smirnov test
D, p = kstest(data, dist_name, args=param)
dist_results.append((dist_name, p))
# select the best fitted distribution
best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
# store the name of the best fit and its p value
print("Best fitting distribution " +str(rating)+": " +str(best_dist))
return best_dist
def dist_by_class(class_col, feature_col, df):
df = df[[class_col,feature_col]]
unique_ir = df[class_col].unique()
#rating_zscores = []
rating_yeojohnson = []
rating_dist = []
for rating in unique_ir:
data = df[df[class_col]==rating]
#z = zscore(data[feature_col].dropna())
xt, lmbda = yeojohnson(np.array(data[feature_col].dropna()))
if len(xt) == 0:
print('No obs')
else:
#rating_zscores.append(xt)
rating_yeojohnson.append(xt)
print(str(len(xt)) + " length without outliers")
"""For anderson:
dist must be 'norm', 'expon', 'gumbel', 'extreme1' or 'logistic'."""
and_darl = anderson(xt, 'norm') #and_darl
#print("and_darl_test: " + str(and_darl))
best_dist = get_best_distribution_groupby(xt,rating) #kstest
rating_dist.append(best_dist)
fig, ax = plt.subplots(5,5,
figsize=(30,30))
for rating,rating_dist,unique_rating,subplot in zip(rating_yeojohnson,rating_dist,
list(unique_ir), ax.flatten()):
sns.distplot(rating, ax=subplot).set_title(str(unique_rating)+' '+str(rating_dist))
dist_by_class('Issuer Rating', 'ESG Combined Score', data)